Data cleaning

path = "data/2017"
temp <- list.files(path=path)
data1 <- read_excel(paste(path,temp[1], sep="/"), sheet = "Student", skip = 1)
data2 <- read_csv(paste(path,temp[2], sep="/"))
tag <- data2 %>% 
  select(`ID`, `UoA ID`, `UoA UPI`, `Tags`)
head(tag,5)
## # A tibble: 5 x 4
##         ID `UoA ID`  `UoA UPI` Tags                                        
##      <int> <chr>     <chr>     <chr>                                       
## 1   1.80e8 EXTERNAL  EXTERNAL  CIE Newsletter,2016 MCE Student,2016 CIE Pa~
## 2   1.80e8 160160291 CSHE674   CIE Newsletter,2016 MCE Student,2016 CIE Pa~
## 3   1.80e8 2667562   ADEK841   Unleash Space,2017 Create and Make Workshop~
## 4   1.80e8 4099538   EERS001   CIE Newsletter,2016 MCE Student,2016 CIE Pa~
## 5   1.80e8 4517867   COSU006   CIE Newsletter,2016 MCE Student,Weekly News~
# Participant programme
partProg <- tag$Tags %>% 
  strsplit(., ",") %>% 
  setNames(tag$`UoA ID`) %>% 
  melt(value.name = "programme")

colnames(partProg) <- c("programme", "ID")

head(partProg,5)
##                        programme       ID
## 1                 CIE Newsletter EXTERNAL
## 2               2016 MCE Student EXTERNAL
## 3           2016 CIE Participant EXTERNAL
## 4 MCE Current Student and Alumni EXTERNAL
## 5           2017 CIE Participant EXTERNAL
#TODO: Active in Programme?
partInfo<- data1 %>% 
  select(`ID`, `Acad Prog`, `Status`, `Descriptio`, `Acad Plan`, `Plan Description`, `Owner of Major/Spec/Module`)

head(partInfo,5)
## # A tibble: 5 x 7
##   ID    `Acad Prog` Status Descriptio `Acad Plan` `Plan Descripti~
##   <chr> <chr>       <chr>  <chr>      <chr>       <chr>           
## 1 5695~ BABS        Activ~ Bachelor ~ PSYC-BSBA   Psychology (BSc)
## 2 2198~ BCOM        Activ~ Bachelor ~ INEN-BCOM   Innovation & En~
## 3 5129~ BSC         Activ~ Bachelor ~ BIOMED-BSC  Biomedical Scie~
## 4 8075~ BA          Activ~ Bachelor ~ POIR-BAMAJ  Politics and In~
## 5 9675~ BCOM        Activ~ Bachelor ~ MGT-BCOM    Management      
## # ... with 1 more variable: `Owner of Major/Spec/Module` <chr>
# Outer join two tables
df <- merge(x=partProg, y=partInfo, by="ID", all.x = TRUE)
head(df,5)
##        ID                                                 programme
## 1 1000280                                 2017 Velocity Participant
## 2 1000280                                      2017 CIE Participant
## 3 1000280            2017 Velocity Innovation Challenge Participant
## 4 1000280                                                  Velocity
## 5 1000280 2017 Velocity Innovation Challenge Commercial Participant
##   Acad Prog Status Descriptio Acad Plan Plan Description
## 1      <NA>   <NA>       <NA>      <NA>             <NA>
## 2      <NA>   <NA>       <NA>      <NA>             <NA>
## 3      <NA>   <NA>       <NA>      <NA>             <NA>
## 4      <NA>   <NA>       <NA>      <NA>             <NA>
## 5      <NA>   <NA>       <NA>      <NA>             <NA>
##   Owner of Major/Spec/Module
## 1                       <NA>
## 2                       <NA>
## 3                       <NA>
## 4                       <NA>
## 5                       <NA>
# Filter out non-students
df_stud <- df %>% 
  filter(!is.na(`Acad Prog`))

head(df_stud,10)
##           ID
## 1  101253805
## 2  101253805
## 3  101253805
## 4  101253805
## 5  101253805
## 6  101253805
## 7  101253805
## 8  101253805
## 9  101253805
## 10 101253805
##                                                                    programme
## 1  2018 Create and Make International Women's Day Maker Workshop Participant
## 2                                  2018 Velocity $100k Challenge Participant
## 3                                  2017 Create and Make Workshop Participant
## 4                                                       2018 CIE Participant
## 5                                  2018 Create and Make Workshop Participant
## 6                                             2017 Unleash Space Participant
## 7                                             2018 Idea Bootcamp Participant
## 8                         2018 Unleash Space International Women's Day Lunch
## 9                                                       Velocity Participant
## 10                                             2018 Velocity I2B Participant
##    Acad Prog              Status                 Descriptio Acad Plan
## 1      COPUA Active in Programme Certificate of Proficiency SCI-COPUA
## 2      COPUA Active in Programme Certificate of Proficiency SCI-COPUA
## 3      COPUA Active in Programme Certificate of Proficiency SCI-COPUA
## 4      COPUA Active in Programme Certificate of Proficiency SCI-COPUA
## 5      COPUA Active in Programme Certificate of Proficiency SCI-COPUA
## 6      COPUA Active in Programme Certificate of Proficiency SCI-COPUA
## 7      COPUA Active in Programme Certificate of Proficiency SCI-COPUA
## 8      COPUA Active in Programme Certificate of Proficiency SCI-COPUA
## 9      COPUA Active in Programme Certificate of Proficiency SCI-COPUA
## 10     COPUA Active in Programme Certificate of Proficiency SCI-COPUA
##    Plan Description Owner of Major/Spec/Module
## 1           Science                    Science
## 2           Science                    Science
## 3           Science                    Science
## 4           Science                    Science
## 5           Science                    Science
## 6           Science                    Science
## 7           Science                    Science
## 8           Science                    Science
## 9           Science                    Science
## 10          Science                    Science
# Only select specific year
df_stud <- df_stud %>% 
  mutate(year=substring(`programme`,0,4), programme=substring(`programme`,6)) %>% 
  filter(year=="2017")

head(df_stud,5)
##          ID                                    programme Acad Prog
## 1 101253805         Create and Make Workshop Participant     COPUA
## 2 101253805                    Unleash Space Participant     COPUA
## 3 101253805                         Unleash Space Access     COPUA
## 4 101253805 Create and Make Arduino Workshop Participant     COPUA
## 5 101253805   Create and Make Space Workshop Participant     COPUA
##                Status                 Descriptio Acad Plan
## 1 Active in Programme Certificate of Proficiency SCI-COPUA
## 2 Active in Programme Certificate of Proficiency SCI-COPUA
## 3 Active in Programme Certificate of Proficiency SCI-COPUA
## 4 Active in Programme Certificate of Proficiency SCI-COPUA
## 5 Active in Programme Certificate of Proficiency SCI-COPUA
##   Plan Description Owner of Major/Spec/Module year
## 1          Science                    Science 2017
## 2          Science                    Science 2017
## 3          Science                    Science 2017
## 4          Science                    Science 2017
## 5          Science                    Science 2017
df_stud %>% 
  group_by(`Owner of Major/Spec/Module`) %>% 
  summarise(count=n()) %>% 
  ggplot() +
  geom_bar(aes(x=reorder(`Owner of Major/Spec/Module`, count), count, fill=count), stat="identity") +
  guides(fill=FALSE) +
  coord_flip() +
  ggtitle("Faculty split overall") +
  theme_hc() + 
  scale_fill_continuous_tableau() + labs(x="")

# Lollipop chart
df_stud %>% 
  group_by(`programme`) %>% 
  summarise(count=n()) %>% 
  filter(!`programme` %in% c("CIE Participant", "Velocity Participant", "Unleash Space Participant", "Unleash Space Access", "Equipment Training Participant" )) %>% 
  arrange(count) %>% 
  mutate(programme = factor(`programme`, levels=.$programme)) %>% 
  ggplot(aes(count, programme, label=count, fill=count, color=-count)) +
  geom_segment(aes(x=0, y=programme, xend=count, yend=programme)) +
  geom_point(size=5) +
  guides(fill=FALSE, color=FALSE) +
  ggtitle("Programme split overall") +
  theme_minimal() + 
  scale_fill_continuous_tableau() + labs(x="", y="")

  #geom_text(nudge_x=10)

# Bar chart
df_stud %>% 
  group_by(`programme`) %>% 
  summarise(count=n()) %>% 
  filter(!`programme` %in% c("CIE Participant", "Velocity Participant", "Unleash Space Participant", "Unleash Space Access", "Equipment Training Participant" )) %>% 
  arrange(count) %>% 
  mutate(programme = factor(`programme`, levels=.$programme)) %>% 
  ggplot(aes(programme, count, label=count, fill=count)) +
  geom_bar(stat="identity") +
  coord_flip() +
  guides(fill=FALSE, color=FALSE) +
  ggtitle("Programme split overall") +
  theme_minimal() + 
  scale_fill_continuous_tableau() + labs(x="", y="")

# Create dataframe for heatmap
df_stud %>% 
  select(`programme`, `Owner of Major/Spec/Module`) %>% 
  filter(!`programme` %in% c("CIE Participant", "Velocity Participant", "Unleash Space Participant", "Unleash Space Access", "Equipment Training Participant" )) %>%
  group_by(`programme`,`Owner of Major/Spec/Module`) %>% 
  summarise(count=n()) %>%
  complete(`Owner of Major/Spec/Module` = unique(df_stud$`Owner of Major/Spec/Module`)) %>% 
  ggplot(aes(`Owner of Major/Spec/Module`, `programme`)) + 
  geom_tile(aes(fill=count), colour="grey97") +
  guides(color=FALSE) +
  ggtitle("Programme split overall") +
  scale_fill_gradient_tableau(na.value = "white") +
  scale_x_discrete(position="top") +
  #scale_fill_gradient(low="white", high = "steelblue", na.value="grey80") +
  coord_fixed(ratio=.25) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust=0)) +
  labs(x="", y="")